In [2]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from statsmodels.sandbox.stats.multicomp import multipletests
In [3]:
# Load the raw sales table (tab-separated); the third column is the
# transaction date, parsed on read.
sales = pd.read_csv(
    'foodmart.sales.tsv',
    sep='\t',
    header=0,
    parse_dates=[2],
)
In [4]:
# Peek at the first five sales rows (rich display via last expression).
sales.head(5)
Out[4]:
In [5]:
# Product dimension table (tab-separated): product_id plus descriptive fields.
products = pd.read_csv(
    'foodmart.products.tsv',
    sep='\t',
    header=0,
)
In [6]:
# Peek at the first five product rows.
products.head(5)
Out[6]:
In [7]:
# Attach a human-readable product name to every sale record.
sales = sales.merge(
    products[['product_id', 'product_name']],
    on='product_id',  # scalar key — equivalent to the single-element list form
    how='inner',
)
In [8]:
# Confirm the merge: sales rows now carry product_name.
sales.head(5)
Out[8]:
In [9]:
# Wide layout: one row per (date, store), one column per product,
# zero where a product had no sales.
# NOTE(review): aggfunc is left at its default ('mean') — confirm intended.
sparse_sales = sales.pivot_table(
    values='sales',
    index=['date', 'store_id'],
    columns=['product_name'],
    fill_value=0,
)
In [10]:
# Peek at the pivoted (sparse) sales matrix.
sparse_sales.head(5)
Out[10]:
In [11]:
%%time
corr_data = []
for i, lhs_column in enumerate(sparse_sales.columns):
for j, rhs_column in enumerate(sparse_sales.columns):
if i >= j:
continue
corr, p = pearsonr(sparse_sales[lhs_column], sparse_sales[rhs_column])
corr_data.append([lhs_column, rhs_column, corr, p])
In [12]:
# Assemble the pairwise results into a labelled DataFrame in a single step.
sales_correlation = pd.DataFrame(
    corr_data,
    columns=['product_A', 'product_B', 'corr', 'p'],
)
In [13]:
# Peek at the pairwise correlation table.
sales_correlation.head(5)
Out[13]:
Сколько гипотез об отсутствии корреляции отвергается без поправки на множественную проверку? (How many of the no-correlation hypotheses are rejected without a multiple-testing correction?)
In [14]:
# How many zero-correlation hypotheses are rejected at alpha = 0.05
# with NO multiple-testing correction (True = rejected).
sales_correlation['p'].lt(0.05).value_counts()
Out[14]:
In [15]:
# Holm (step-down Bonferroni) correction, FWER controlled at 0.05.
# multipletests also returns two corrected alpha levels (Sidak/Bonferroni)
# that this analysis never uses — discard them instead of binding a1/a2.
reject, p_corrected, *_ = multipletests(sales_correlation.p,
                                        alpha=0.05,
                                        method='holm')
In [16]:
# Record the per-pair decision and the Holm-adjusted p-values on the frame
# (the two assignments are independent; order is irrelevant).
sales_correlation['reject'] = reject
sales_correlation['p_corrected'] = p_corrected
In [17]:
# Inspect the frame with the corrected p-values attached.
sales_correlation.head(5)
Out[17]:
In [18]:
# Rejections surviving the Holm correction (True = still rejected).
sales_correlation['reject'].value_counts()
Out[18]:
In [19]:
# Significant pairs only, strongest correlation first.
# `reject` is already boolean, so comparing it `== True` is redundant —
# use the mask directly.
sales_correlation[sales_correlation.reject].sort_values(by='corr', ascending=False)
Out[19]:
In [20]:
# Benjamini–Hochberg correction, FDR controlled at 0.05 (less conservative
# than Holm's FWER control). The returned corrected alpha levels are unused
# — discard them instead of binding a1/a2.
reject, p_corrected, *_ = multipletests(sales_correlation.p,
                                        alpha=0.05,
                                        method='fdr_bh')
In [21]:
# Overwrite the Holm results with the BH-adjusted p-values and decisions
# (the two assignments are independent; order is irrelevant).
sales_correlation['reject'] = reject
sales_correlation['p_corrected'] = p_corrected
In [22]:
# Inspect the frame with the BH-adjusted values attached.
sales_correlation.head(5)
Out[22]:
In [23]:
# Rejections surviving the Benjamini–Hochberg correction.
sales_correlation['reject'].value_counts()
Out[23]:
In [24]:
# Significant pairs under FDR control, strongest correlation first.
# `reject` is already boolean, so comparing it `== True` is redundant —
# use the mask directly.
sales_correlation[sales_correlation.reject].sort_values(by='corr', ascending=False)
Out[24]:
In [ ]:
%%time
corr_data = []
for i, lhs_column in enumerate(sparse_sales.columns):
for j, rhs_column in enumerate(sparse_sales.columns):
if i >= j:
continue
corr, p = pearsonr(sparse_sales[lhs_column], sparse_sales[rhs_column])
prod = (sparse_sales[lhs_column]-np.mean(sparse_sales[lhs_column])) * (sparse_sales[rhs_column]-np.mean(sparse_sales[rhs_column]))
theta = np.mean((prod - np.cov(sparse_sales)[1,2])**2)
T = sum(prod) / np.sqrt(n * theta)
corr_data.append([lhs_column, rhs_column, corr, p, T])
In [48]:
# Assemble the pairwise results (now including the T statistic) into a
# labelled DataFrame in a single step.
sales_correlation = pd.DataFrame(
    corr_data,
    columns=['product_A', 'product_B', 'corr', 'p', 'T'],
)
In [ ]:
# NOTE(review): unfinished scratch cell — it does not run as written:
#   * `df` is never defined in this notebook (presumably `sparse_sales` was meant);
#   * `p` is a stale scalar left over from the last iteration of the loop above;
#   * for p < 1, np.log(p) is negative, so the sqrt argument can be negative (NaN).
# It looks like the start of an extreme-value threshold for the maximum
# statistic (a 2*log(log(k)) normalising term) — TODO confirm intent or delete.
alpha = 1
k = df.shape[1]
a = 2 * np.log(np.log(k))
b = np.sqrt(4 * np.log(p) - a)